0.3. Interim Data Initial Exploration
This notebook provides a brief exploration of the interim dataset obtained from youtube_trends/dataset.py and saved in data/interim/dataset.csv. This exploration was performed to determine the techniques and tools to use during data processing for future analysis. The data processing stage can also be found in youtube_trends/dataset.py.
import re
import torch
import warnings
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from tqdm import tqdm
from IPython.display import HTML
from IPython.display import display
from scipy.stats import gaussian_kde
from sklearn.decomposition import PCA
from plotly.subplots import make_subplots
from sklearn.preprocessing import MinMaxScaler
from dateutil.relativedelta import relativedelta
from concurrent.futures import ThreadPoolExecutor
from youtube_trends.config import INTERIM_DATA_DIR, PROCESSED_DATA_DIR
import plotly.offline as pyo
import plotly.io as pio
# Use the "notebook" renderer for plotly figures and initialise plotly's
# offline notebook mode.
pio.renderers.default = "notebook"
pyo.init_notebook_mode(connected=False)

# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
Checking the quality of the data in df_train, df_val and df_test.
# Read the three interim splits (train / validation / test) in one pass.
# low_memory=False makes pandas read each file in a single chunk.
df_train, df_val, df_test = (
    pd.read_csv(INTERIM_DATA_DIR / csv_name, low_memory=False)
    for csv_name in ("train_dataset.csv", "val_dataset.csv", "test_dataset.csv")
)
display(df_train)
| video_published_at | video_duration | video_view_count | video_like_count | video_comment_count | channel_view_count | channel_subscriber_count | published_dayofweek | published_hour | days_to_trend | ... | video_title_language_sv | video_title_language_sw | video_title_language_tl | video_title_language_tr | video_title_language_unknown | video_title_language_vi | video_category_pca_0 | video_category_pca_1 | video_category_pca_2 | video_category_pca_3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2025-03-19 12:30:12 | 60.0 | 9075151.0 | 228504.0 | 119.0 | 5.082500e+09 | 9620000.0 | 2 | 12 | 10 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.473435 | 0.714817 | -0.309635 | -0.096183 |
| 1 | 2025-03-19 12:30:12 | 60.0 | 9345171.0 | 233878.0 | 121.0 | 5.085156e+09 | 9620000.0 | 2 | 12 | 12 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | -0.473435 | 0.714817 | -0.309635 | -0.096183 |
| 2 | 2025-03-19 12:30:12 | 60.0 | 5704659.0 | 138572.0 | 72.0 | 5.070373e+09 | 9610000.0 | 2 | 12 | 3 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | -0.473435 | 0.714817 | -0.309635 | -0.096183 |
| 3 | 2025-03-19 12:30:12 | 60.0 | 3459131.0 | 79531.0 | 46.0 | 5.067204e+09 | 9600000.0 | 2 | 12 | 2 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | -0.473435 | 0.714817 | -0.309635 | -0.096183 |
| 4 | 2025-03-19 12:30:12 | 60.0 | 10228951.0 | 253555.0 | 129.0 | 5.096864e+09 | 9630000.0 | 2 | 12 | 19 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | -0.473435 | 0.714817 | -0.309635 | -0.096183 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 271640 | 2025-04-10 21:18:05 | 660.0 | 128084.0 | 3259.0 | 454.0 | 1.601386e+08 | 358000.0 | 3 | 21 | 2 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.282365 | -0.088570 | 0.739837 | -0.551516 |
| 271641 | 2025-04-10 21:18:05 | 660.0 | 98587.0 | 2777.0 | 389.0 | 1.599977e+08 | 358000.0 | 3 | 21 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.282365 | -0.088570 | 0.739837 | -0.551516 |
| 271642 | 2025-04-10 21:18:05 | 660.0 | 98412.0 | 2776.0 | 389.0 | 1.599977e+08 | 358000.0 | 3 | 21 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.282365 | -0.088570 | 0.739837 | -0.551516 |
| 271643 | 2025-04-10 21:18:05 | 660.0 | 120471.0 | 3122.0 | 440.0 | 1.600908e+08 | 358000.0 | 3 | 21 | 1 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.282365 | -0.088570 | 0.739837 | -0.551516 |
| 271644 | 2025-04-10 21:18:05 | 660.0 | 98412.0 | 2775.0 | 389.0 | 1.599977e+08 | 358000.0 | 3 | 21 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.282365 | -0.088570 | 0.739837 | -0.551516 |
271645 rows × 593 columns
# Render the validation split.
display(df_val)
| video_published_at | video_duration | video_view_count | video_like_count | video_comment_count | channel_view_count | channel_subscriber_count | published_dayofweek | published_hour | days_to_trend | ... | video_title_language_sv | video_title_language_sw | video_title_language_tl | video_title_language_tr | video_title_language_unknown | video_title_language_vi | video_category_pca_0 | video_category_pca_1 | video_category_pca_2 | video_category_pca_3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2025-04-10 21:18:05 | 660.0 | 134505.0 | 3342.0 | 461.0 | 160443341.0 | 358000.0 | 3 | 21 | 5 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.282365 | -0.088570 | 0.739837 | -0.551516 |
| 1 | 2025-04-10 21:18:05 | 660.0 | 134505.0 | 3342.0 | 461.0 | 160443341.0 | 358000.0 | 3 | 21 | 5 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.282365 | -0.088570 | 0.739837 | -0.551516 |
| 2 | 2025-04-10 21:18:05 | 660.0 | 128080.0 | 3259.0 | 454.0 | 160138604.0 | 358000.0 | 3 | 21 | 2 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.282365 | -0.088570 | 0.739837 | -0.551516 |
| 3 | 2025-04-10 21:18:37 | 171.0 | 359909.0 | 9059.0 | 825.0 | 85008981.0 | 525000.0 | 3 | 21 | 13 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.411025 | -0.690400 | -0.454210 | -0.118053 |
| 4 | 2025-04-10 21:18:37 | 171.0 | 297385.0 | 8654.0 | 794.0 | 84050367.0 | 521000.0 | 3 | 21 | 8 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.411025 | -0.690400 | -0.454210 | -0.118053 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 58211 | 2025-04-17 12:01:27 | 28.0 | 1304269.0 | 30991.0 | 104.0 | 901086919.0 | 2070000.0 | 3 | 12 | 5 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.245858 | -0.062659 | 0.329247 | 0.743482 |
| 58212 | 2025-04-17 12:01:33 | 62.0 | 361452.0 | 19420.0 | 117.0 | 478672644.0 | 1730000.0 | 3 | 12 | 3 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.738773 | 0.031086 | -0.069742 | -0.032040 |
| 58213 | 2025-04-17 12:01:33 | 62.0 | 544466.0 | 28039.0 | 144.0 | 479543684.0 | 1730000.0 | 3 | 12 | 6 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.738773 | 0.031086 | -0.069742 | -0.032040 |
| 58214 | 2025-04-17 12:01:33 | 62.0 | 553162.0 | 28363.0 | 144.0 | 479693133.0 | 1730000.0 | 3 | 12 | 7 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.738773 | 0.031086 | -0.069742 | -0.032040 |
| 58215 | 2025-04-17 12:01:33 | 62.0 | 447853.0 | 23982.0 | 130.0 | 478938292.0 | 1730000.0 | 3 | 12 | 4 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.738773 | 0.031086 | -0.069742 | -0.032040 |
58216 rows × 593 columns
# Render the test split.
display(df_test)
| video_published_at | video_duration | video_view_count | video_like_count | video_comment_count | channel_view_count | channel_subscriber_count | published_dayofweek | published_hour | days_to_trend | ... | video_title_language_sv | video_title_language_sw | video_title_language_tl | video_title_language_tr | video_title_language_unknown | video_title_language_vi | video_category_pca_0 | video_category_pca_1 | video_category_pca_2 | video_category_pca_3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2025-04-17 12:01:33 | 62.0 | 300117.0 | 16302.0 | 104.0 | 478357091.0 | 1730000.0 | 3 | 12 | 2 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.738773 | 0.031086 | -0.069742 | -0.03204 |
| 1 | 2025-04-17 12:01:33 | 62.0 | 511110.0 | 26760.0 | 135.0 | 479297744.0 | 1730000.0 | 3 | 12 | 5 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.738773 | 0.031086 | -0.069742 | -0.03204 |
| 2 | 2025-04-17 12:01:33 | 62.0 | 198056.0 | 11222.0 | 68.0 | 477915510.0 | 1730000.0 | 3 | 12 | 1 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.738773 | 0.031086 | -0.069742 | -0.03204 |
| 3 | 2025-04-17 12:01:35 | 1131.0 | 334192.0 | 19842.0 | 421.0 | 77013861.0 | 215000.0 | 3 | 12 | 2 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.738773 | 0.031086 | -0.069742 | -0.03204 |
| 4 | 2025-04-17 12:01:35 | 1131.0 | 361084.0 | 20553.0 | 426.0 | 77194021.0 | 216000.0 | 3 | 12 | 4 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.738773 | 0.031086 | -0.069742 | -0.03204 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 58214 | 2025-04-29 23:39:17 | 1200.0 | 494390.0 | 27101.0 | 1610.0 | 190363872.0 | 3270000.0 | 1 | 23 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.738773 | 0.031086 | -0.069742 | -0.03204 |
| 58215 | 2025-04-29 23:39:17 | 1200.0 | 494385.0 | 27085.0 | 1610.0 | 190363872.0 | 3270000.0 | 1 | 23 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.738773 | 0.031086 | -0.069742 | -0.03204 |
| 58216 | 2025-04-29 23:39:17 | 1200.0 | 494385.0 | 27085.0 | 1610.0 | 190363872.0 | 3270000.0 | 1 | 23 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.738773 | 0.031086 | -0.069742 | -0.03204 |
| 58217 | 2025-04-29 23:39:17 | 1200.0 | 494385.0 | 27082.0 | 1610.0 | 190363872.0 | 3270000.0 | 1 | 23 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.738773 | 0.031086 | -0.069742 | -0.03204 |
| 58218 | 2025-04-29 23:39:17 | 1200.0 | 494385.0 | 27082.0 | 1610.0 | 190363872.0 | 3270000.0 | 1 | 23 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.738773 | 0.031086 | -0.069742 | -0.03204 |
58219 rows × 593 columns
# List every column name of the training split, one per line.
for column in df_train.columns.tolist():
    print(column)
video_published_at video_duration video_view_count video_like_count video_comment_count channel_view_count channel_subscriber_count published_dayofweek published_hour days_to_trend video_title_length video_tag_count thumbnail_brightness thumbnail_contrast thumbnail_saturation thumb_pca_0 thumb_pca_1 thumb_pca_2 thumb_pca_3 thumb_pca_4 thumb_pca_5 thumb_pca_6 thumb_pca_7 thumb_pca_8 thumb_pca_9 thumb_pca_10 thumb_pca_11 thumb_pca_12 thumb_pca_13 thumb_pca_14 thumb_pca_15 thumb_pca_16 thumb_pca_17 thumb_pca_18 thumb_pca_19 thumb_pca_20 thumb_pca_21 thumb_pca_22 thumb_pca_23 thumb_pca_24 thumb_pca_25 thumb_pca_26 thumb_pca_27 thumb_pca_28 thumb_pca_29 thumb_pca_30 thumb_pca_31 thumb_pca_32 thumb_pca_33 thumb_pca_34 thumb_pca_35 thumb_pca_36 thumb_pca_37 thumb_pca_38 thumb_pca_39 video_title_clean video_title_translated 10 100 1000 10000 100000 11 12 14 1446 15 150 19 20 202425 2025 2026 21 210m 22 24 2425 25 26 27 28 30 41 4k abandonado action acts actually ad aerated afford after ahead al all an and andreygrechka animation anticipates apple appliances april arcane are argentina ariana arrire arsenal as at ate baby babys back bad ball banana barcelona be beauty ben beneagle benny best big black blackout blanco blind blm bomb bought brasil breathe brighter bro brother bruises bunker but button by cake can candy car carrin casa cat catch cats challenge champions cheating chefkoudy chinese cinema city clash clasific cleaned cleaning clip coat code coin color comedy con concrete cool copa couldnt couple covered cup cut cyrus da date davidmatthew day days de deadliest del demariki der di diamond dice did die different dinner diy do dog don dont eagle earth el eladio elderly eliminatorias embarazada emilia en end enemy ep episode episodio eref error es escape escaping escondite esta este ever every everyone extended fake familia family farthest fasten father fc feat fecha few fiesta final find first flexible floating floor flying football for fortnite free friend from ft 
fucking fue fui full funny gadgets game gaspi gelato gift girl gmgolf goal goldman gomez goodman gorillatough got gotta grand grande gravity guess guy hack had happy has have he her heres highlights his history hold home horas hottest house how humanities ibai ich identical if ilusion im in incredible indestructible into iq is ishowspeed it italians items je jhope juego just kbrn kid kids kind kindness kitchen kungfu la las last late latest lazy le league les let libertybarros life like lisa little live lo looks los love lyric ma made madrid magic makeup man mans march marwan match me members messed mi miami miley minecraft mis mit mona more most move movie movies ms mujeres mundial mundo music mv my na nations nevada never new next nicki nicole nintendo no noche not notice novayaeracomig now of off official oficial on one or other our out own pablo para park part pas password pennies pisode places play polinesios poppy por possible pov prank prepared prix prod putty que race reacciona real relateisabellaafro relationship remember remix respect resumen rey right ronaldo room rya said school screw se season secret secreto securely see selena serves sharing she shopping shorts shortsfeed show si sidemen sigma sisters skill smart smoke so soccer solo sorry speedmcqueen1 speeds split sprunki squid stanley stop story stray stromae su sub sudamericanas sumo super sure survived switch take talk tape teaser temporada than that the theory they this time tini tips to todo too tools top tour trailer training trending trick truman trump try trying turned tv twin uefa un una unexpected up us utensils via victoriapfeifer video villain viral visualizer vlog vs vuelta was way we week what whats when which who why will win wins with woman work worker world worlds wrestler xeneize year years yes you your youtuber youtubers video_title_language_af video_title_language_ca video_title_language_cs video_title_language_cy video_title_language_da video_title_language_de 
video_title_language_en video_title_language_es video_title_language_et video_title_language_fi video_title_language_fr video_title_language_hr video_title_language_hu video_title_language_id video_title_language_it video_title_language_lt video_title_language_lv video_title_language_nl video_title_language_no video_title_language_pl video_title_language_pt video_title_language_ro video_title_language_sk video_title_language_sl video_title_language_so video_title_language_sq video_title_language_sv video_title_language_sw video_title_language_tl video_title_language_tr video_title_language_unknown video_title_language_vi video_category_pca_0 video_category_pca_1 video_category_pca_2 video_category_pca_3
Removing duplicated values and recreating df_train, df_val and df_test.
# Stack the three splits, remove exact duplicate rows and order the result
# by the 'video_published_at' column.
df = (
    pd.concat([df_train, df_val, df_test], axis=0, ignore_index=True)
    .drop_duplicates()
    .sort_values(by='video_published_at')
)

# Positional boundaries of the new split: first 70 % of the rows for train,
# the next 15 % for validation, the remainder for test.
train_end = int(len(df) * 0.7)
val_end = int(len(df) * 0.85)

# Slice by position and give every split a fresh 0..n-1 index.
df_train = df.iloc[:train_end].reset_index(drop=True)
df_val = df.iloc[train_end:val_end].reset_index(drop=True)
df_test = df.iloc[val_end:].reset_index(drop=True)

display(df_train)
| video_published_at | video_duration | video_view_count | video_like_count | video_comment_count | channel_view_count | channel_subscriber_count | published_dayofweek | published_hour | days_to_trend | ... | video_title_language_sv | video_title_language_sw | video_title_language_tl | video_title_language_tr | video_title_language_unknown | video_title_language_vi | video_category_pca_0 | video_category_pca_1 | video_category_pca_2 | video_category_pca_3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2025-03-19 12:30:12 | 60.0 | 9075151.0 | 228504.0 | 119.0 | 5.082500e+09 | 9620000.0 | 2 | 12 | 10 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.473435 | 0.714817 | -0.309635 | -0.096183 |
| 1 | 2025-03-19 12:30:12 | 60.0 | 9431677.0 | 235578.0 | 122.0 | 5.086626e+09 | 9620000.0 | 2 | 12 | 13 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | -0.473435 | 0.714817 | -0.309635 | -0.096183 |
| 2 | 2025-03-19 12:30:12 | 60.0 | 8578672.0 | 217814.0 | 112.0 | 5.077640e+09 | 9620000.0 | 2 | 12 | 6 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | -0.473435 | 0.714817 | -0.309635 | -0.096183 |
| 3 | 2025-03-19 12:30:12 | 60.0 | 10305804.0 | 255262.0 | 129.0 | 5.098385e+09 | 9630000.0 | 2 | 12 | 20 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | -0.473435 | 0.714817 | -0.309635 | -0.096183 |
| 4 | 2025-03-19 12:30:12 | 60.0 | 9546141.0 | 238542.0 | 123.0 | 5.087882e+09 | 9620000.0 | 2 | 12 | 14 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | -0.473435 | 0.714817 | -0.309635 | -0.096183 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 130222 | 2025-04-11 09:00:07 | 135.0 | 853255.0 | 15395.0 | 7145.0 | 1.220683e+08 | 162000.0 | 4 | 9 | 17 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.411025 | -0.690400 | -0.454210 | -0.118053 |
| 130223 | 2025-04-11 09:00:07 | 143.0 | 506394.0 | 6333.0 | 264.0 | 9.929313e+08 | 822000.0 | 4 | 9 | 5 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.411025 | -0.690400 | -0.454210 | -0.118053 |
| 130224 | 2025-04-11 09:00:07 | 143.0 | 646582.0 | 6938.0 | 277.0 | 9.936595e+08 | 822000.0 | 4 | 9 | 8 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.411025 | -0.690400 | -0.454210 | -0.118053 |
| 130225 | 2025-04-11 09:00:07 | 143.0 | 506386.0 | 6333.0 | 264.0 | 9.929313e+08 | 822000.0 | 4 | 9 | 5 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.411025 | -0.690400 | -0.454210 | -0.118053 |
| 130226 | 2025-04-11 09:00:07 | 143.0 | 973723.0 | 7970.0 | 303.0 | 9.956019e+08 | 823000.0 | 4 | 9 | 17 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.411025 | -0.690400 | -0.454210 | -0.118053 |
130227 rows × 593 columns
# Render the rebuilt validation split.
display(df_val)
| video_published_at | video_duration | video_view_count | video_like_count | video_comment_count | channel_view_count | channel_subscriber_count | published_dayofweek | published_hour | days_to_trend | ... | video_title_language_sv | video_title_language_sw | video_title_language_tl | video_title_language_tr | video_title_language_unknown | video_title_language_vi | video_category_pca_0 | video_category_pca_1 | video_category_pca_2 | video_category_pca_3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2025-04-11 09:00:07 | 135.0 | 190418.0 | 11268.0 | 6282.0 | 116893584.0 | 159000.0 | 4 | 9 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.411025 | -0.690400 | -0.454210 | -0.118053 |
| 1 | 2025-04-11 09:00:07 | 135.0 | 517706.0 | 14028.0 | 7044.0 | 119496239.0 | 160000.0 | 4 | 9 | 5 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.411025 | -0.690400 | -0.454210 | -0.118053 |
| 2 | 2025-04-11 09:00:07 | 135.0 | 429518.0 | 13454.0 | 6898.0 | 118139416.0 | 160000.0 | 4 | 9 | 3 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.411025 | -0.690400 | -0.454210 | -0.118053 |
| 3 | 2025-04-11 09:00:07 | 143.0 | 799801.0 | 7468.0 | 285.0 | 994502818.0 | 822000.0 | 4 | 9 | 12 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.411025 | -0.690400 | -0.454210 | -0.118053 |
| 4 | 2025-04-11 09:00:07 | 143.0 | 799811.0 | 7468.0 | 285.0 | 994502818.0 | 822000.0 | 4 | 9 | 12 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.411025 | -0.690400 | -0.454210 | -0.118053 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27902 | 2025-04-17 17:00:06 | 166.0 | 89108.0 | 4467.0 | 630.0 | 81058112.0 | 142000.0 | 3 | 17 | 2 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.411025 | -0.690400 | -0.454210 | -0.118053 |
| 27903 | 2025-04-17 17:00:06 | 4334.0 | 87576.0 | 2972.0 | 135.0 | 57241349.0 | 128000.0 | 3 | 17 | 7 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | -0.231205 | -0.054801 | 0.258836 | 0.345655 |
| 27904 | 2025-04-17 17:00:06 | 4334.0 | 92139.0 | 3032.0 | 137.0 | 57311462.0 | 128000.0 | 3 | 17 | 10 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | -0.231205 | -0.054801 | 0.258836 | 0.345655 |
| 27905 | 2025-04-17 17:00:06 | 4334.0 | 89306.0 | 2989.0 | 135.0 | 57269882.0 | 128000.0 | 3 | 17 | 8 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | -0.231205 | -0.054801 | 0.258836 | 0.345655 |
| 27906 | 2025-04-17 17:00:06 | 166.0 | 118059.0 | 4847.0 | 680.0 | 81058112.0 | 142000.0 | 3 | 17 | 4 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.411025 | -0.690400 | -0.454210 | -0.118053 |
27907 rows × 593 columns
# Render the rebuilt test split.
display(df_test)
| video_published_at | video_duration | video_view_count | video_like_count | video_comment_count | channel_view_count | channel_subscriber_count | published_dayofweek | published_hour | days_to_trend | ... | video_title_language_sv | video_title_language_sw | video_title_language_tl | video_title_language_tr | video_title_language_unknown | video_title_language_vi | video_category_pca_0 | video_category_pca_1 | video_category_pca_2 | video_category_pca_3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2025-04-17 17:00:06 | 166.0 | 169198.0 | 5478.0 | 729.0 | 81905201.0 | 142000.0 | 3 | 17 | 11 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.411025 | -0.690400 | -0.454210 | -0.118053 |
| 1 | 2025-04-17 17:00:06 | 4334.0 | 94562.0 | 3066.0 | 139.0 | 57348201.0 | 128000.0 | 3 | 17 | 12 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | -0.231205 | -0.054801 | 0.258836 | 0.345655 |
| 2 | 2025-04-17 17:00:08 | 171.0 | 1108951.0 | 13607.0 | 313.0 | 394182210.0 | 393000.0 | 3 | 17 | 11 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.411025 | -0.690400 | -0.454210 | -0.118053 |
| 3 | 2025-04-17 17:00:08 | 171.0 | 1182751.0 | 13976.0 | 319.0 | 394182210.0 | 393000.0 | 3 | 17 | 12 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.411025 | -0.690400 | -0.454210 | -0.118053 |
| 4 | 2025-04-17 17:00:08 | 171.0 | 234037.0 | 7686.0 | 213.0 | 390087497.0 | 391000.0 | 3 | 17 | 1 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.411025 | -0.690400 | -0.454210 | -0.118053 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27901 | 2025-04-29 23:38:45 | 2019.0 | 72380.0 | 5500.0 | 387.0 | 23682586.0 | 166000.0 | 1 | 23 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.738773 | 0.031086 | -0.069742 | -0.032040 |
| 27902 | 2025-04-29 23:39:17 | 1200.0 | 494385.0 | 27085.0 | 1610.0 | 190363872.0 | 3270000.0 | 1 | 23 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.738773 | 0.031086 | -0.069742 | -0.032040 |
| 27903 | 2025-04-29 23:39:17 | 1200.0 | 494385.0 | 27085.0 | 1610.0 | 190363872.0 | 3270000.0 | 1 | 23 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.738773 | 0.031086 | -0.069742 | -0.032040 |
| 27904 | 2025-04-29 23:39:17 | 1200.0 | 494390.0 | 27101.0 | 1610.0 | 190363872.0 | 3270000.0 | 1 | 23 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.738773 | 0.031086 | -0.069742 | -0.032040 |
| 27905 | 2025-04-29 23:39:17 | 1200.0 | 494385.0 | 27082.0 | 1610.0 | 190363872.0 | 3270000.0 | 1 | 23 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.738773 | 0.031086 | -0.069742 | -0.032040 |
27906 rows × 593 columns
Title columns
# Keep only the columns whose name contains 'video_title_' and list them.
df_train_title = df_train.filter(like='video_title_')
for column in df_train_title.columns:
    print(column)
video_title_length video_title_clean video_title_translated video_title_language_af video_title_language_ca video_title_language_cs video_title_language_cy video_title_language_da video_title_language_de video_title_language_en video_title_language_es video_title_language_et video_title_language_fi video_title_language_fr video_title_language_hr video_title_language_hu video_title_language_id video_title_language_it video_title_language_lt video_title_language_lv video_title_language_nl video_title_language_no video_title_language_pl video_title_language_pt video_title_language_ro video_title_language_sk video_title_language_sl video_title_language_so video_title_language_sq video_title_language_sv video_title_language_sw video_title_language_tl video_title_language_tr video_title_language_unknown video_title_language_vi
# Drop the two title text columns from every split.
df_train = df_train.drop(columns=['video_title_clean', 'video_title_translated'])
df_val = df_val.drop(columns=['video_title_clean', 'video_title_translated'])
df_test = df_test.drop(columns=['video_title_clean', 'video_title_translated'])

# Re-select the remaining 'video_title_' columns and list them again.
df_train_title = df_train.filter(like='video_title_')
for column in df_train_title.columns:
    print(column)
video_title_length video_title_language_af video_title_language_ca video_title_language_cs video_title_language_cy video_title_language_da video_title_language_de video_title_language_en video_title_language_es video_title_language_et video_title_language_fi video_title_language_fr video_title_language_hr video_title_language_hu video_title_language_id video_title_language_it video_title_language_lt video_title_language_lv video_title_language_nl video_title_language_no video_title_language_pl video_title_language_pt video_title_language_ro video_title_language_sk video_title_language_sl video_title_language_so video_title_language_sq video_title_language_sv video_title_language_sw video_title_language_tl video_title_language_tr video_title_language_unknown video_title_language_vi
def plot_distribution(df, column, color="#636EFA"):
    """Show a density histogram with a KDE curve above a horizontal boxplot.

    Args:
        df: DataFrame holding the data.
        column: Name of the column to plot; missing values are dropped first.
        color: Colour used for the histogram bars and the boxplot.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    values = df[column].dropna()

    fig = make_subplots(
        rows=2, cols=1,
        shared_xaxes=True,
        row_heights=[0.7, 0.3],
        vertical_spacing=0.05,
        subplot_titles=(f"{column} - KDE", f"{column} - Boxplot")
    )

    # Normalise the bars to a probability density so they share the y-axis
    # scale of the KDE curve and match the "Density" axis label.
    fig.add_trace(go.Histogram(
        x=values,
        name=f"{column} Histograma",
        marker_color=color,
        opacity=0.75,
        histnorm='probability density'
    ), row=1, col=1)

    # gaussian_kde cannot be fitted on constant data (singular covariance) or
    # on fewer than two points; in that case only the histogram is shown.
    try:
        kde = gaussian_kde(values)
    except (np.linalg.LinAlgError, ValueError):
        kde = None

    if kde is not None:
        x_range = np.linspace(values.min(), values.max(), 200)
        fig.add_trace(go.Scatter(
            x=x_range,
            y=kde(x_range),
            mode='lines',
            name=f"{column} KDE",
            line=dict(color="#2A3F5F", width=2)
        ), row=1, col=1)

    fig.add_trace(go.Box(
        x=values,
        name=f"{column} Boxplot",
        marker_color=color,
        boxmean=True,
        orientation='h'
    ), row=2, col=1)

    fig.update_layout(
        height=500,
        width=700,
        title_text=f"Distribution and Boxplot for {column}",
        template="plotly_white"
    )
    fig.update_xaxes(title_text="Value", row=2, col=1)
    fig.update_yaxes(title_text="Density", row=1, col=1)
    fig.show()
# Distribution of the title length in the training split.
plot_distribution(df_train_title, 'video_title_length')

# From here on only the per-language columns are needed.
df_train_title = df_train_title.drop(['video_title_length'], axis=1)

# Summing each language column gives the number of training rows flagged with
# that language; the common prefix is stripped to leave the language code.
train_lang_videos = df_train_title.sum(numeric_only=True).copy()
train_lang_videos = train_lang_videos[train_lang_videos.index.str.startswith('video_title_language_')]
train_lang_videos.index = train_lang_videos.index.str.replace('video_title_language_', '', regex=False)

df_train_lang = train_lang_videos.reset_index()
df_train_lang.columns = ['language', 'videos']
df_train_lang = df_train_lang.sort_values(by='videos', ascending=False)

fig = px.bar(df_train_lang, x='language', y='videos', title='Amount of videos per detected language')
fig.update_layout(xaxis_title='Language', yaxis_title='Videos', template='plotly_white')
fig.show()

# Express the share as a real percentage (0-100). Rounding the raw 0-1
# fraction to two decimals would collapse rare languages to 0.0.
df_train_lang['percentage'] = (df_train_lang['videos'] / len(df_train_title) * 100).round(2)

fig = px.bar(df_train_lang, x='language', y='percentage', title='Percentage of videos in each detected language')
fig.update_layout(xaxis_title='Language', yaxis_title='Percentage (%)', template='plotly_white')
fig.show()
PCA will be applied to reduce dimensionality, since some of the detected languages are negligible.
def reduce_language_pca(df_train, df_val, df_test, pca_variance_target=0.8, pca_max_components=10):
    """Replace the ``video_title_language_*`` columns with PCA components.

    The PCA is fitted on the training split only and then applied to the
    validation and test splits. Rows with a missing value in any language
    column are dropped from each split before the transformation.

    Args:
        df_train: Training DataFrame; defines the language columns and is the
            only split used to fit the PCA.
        df_val: Validation DataFrame containing the same language columns.
        df_test: Test DataFrame containing the same language columns.
        pca_variance_target: Cumulative explained-variance ratio the retained
            components should reach.
        pca_max_components: Upper bound on the number of retained components.

    Returns:
        Tuple ``(df_train, df_val, df_test, pca)`` where the language columns
        of each split are replaced by ``lang_pca_<i>`` columns and ``pca`` is
        the fitted PCA object.

    Raises:
        ValueError: If ``df_train`` has no ``video_title_language_*`` column.
    """
    lang_cols = [col for col in df_train.columns if str(col).startswith('video_title_language_')]
    if not lang_cols:
        raise ValueError("No 'video_title_language_' columns found in df_train.")

    df_train = df_train.dropna(subset=lang_cols)
    df_val = df_val.dropna(subset=lang_cols)
    df_test = df_test.dropna(subset=lang_cols)

    X_train = df_train[lang_cols].values
    X_val = df_val[lang_cols].values
    X_test = df_test[lang_cols].values

    # A full PCA on the training data is used only to pick the component count.
    cumulative = np.cumsum(PCA().fit(X_train).explained_variance_ratio_)
    reached = cumulative >= pca_variance_target
    if reached.any():
        # Index of the first component at which the target is met, plus one.
        n_components = int(np.argmax(reached)) + 1
    else:
        # np.argmax on an all-False mask returns 0, which would silently keep
        # a single component; keep every component instead when the target is
        # never reached (e.g. a target of 1.0 missed by float round-off).
        n_components = len(cumulative)
    n_components = min(pca_max_components, n_components)

    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train)
    X_val_pca = pca.transform(X_val)
    X_test_pca = pca.transform(X_test)

    # Reuse each split's index so the concat below aligns row by row.
    pca_cols = [f'lang_pca_{i}' for i in range(n_components)]
    df_train_pca = pd.DataFrame(X_train_pca, columns=pca_cols, index=df_train.index)
    df_val_pca = pd.DataFrame(X_val_pca, columns=pca_cols, index=df_val.index)
    df_test_pca = pd.DataFrame(X_test_pca, columns=pca_cols, index=df_test.index)

    df_train = pd.concat([df_train.drop(columns=lang_cols), df_train_pca], axis=1)
    df_val = pd.concat([df_val.drop(columns=lang_cols), df_val_pca], axis=1)
    df_test = pd.concat([df_test.drop(columns=lang_cols), df_test_pca], axis=1)

    return df_train, df_val, df_test, pca
# Swap the per-language columns of every split for PCA components, using the
# function's default variance target and component cap, then show the result
# for the training split.
df_train, df_val, df_test, language_pca = reduce_language_pca(
    df_train=df_train,
    df_val=df_val,
    df_test=df_test,
)
display(df_train.filter(like='lang_pca_'))
| lang_pca_0 | lang_pca_1 | lang_pca_2 | lang_pca_3 | lang_pca_4 | lang_pca_5 | lang_pca_6 | lang_pca_7 | lang_pca_8 | lang_pca_9 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.630230 | 0.082127 | 0.011908 | 0.000667 | -0.001832 | -0.003664 | 0.004847 | 0.003126 | 0.001352 | 0.000007 |
| 1 | -0.645268 | 0.691893 | 0.044201 | 0.002208 | -0.006004 | -0.011748 | 0.015098 | 0.009484 | 0.004019 | 0.000021 |
| 2 | -0.645268 | 0.691893 | 0.044201 | 0.002208 | -0.006004 | -0.011748 | 0.015098 | 0.009484 | 0.004019 | 0.000021 |
| 3 | -0.645268 | 0.691893 | 0.044201 | 0.002208 | -0.006004 | -0.011748 | 0.015098 | 0.009484 | 0.004019 | 0.000021 |
| 4 | -0.645268 | 0.691893 | 0.044201 | 0.002208 | -0.006004 | -0.011748 | 0.015098 | 0.009484 | 0.004019 | 0.000021 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 130222 | 0.630230 | 0.082127 | 0.011908 | 0.000667 | -0.001832 | -0.003664 | 0.004847 | 0.003126 | 0.001352 | 0.000007 |
| 130223 | -0.333936 | -0.285876 | -0.177380 | -0.034585 | 0.122728 | 0.717304 | 0.586194 | 0.149858 | 0.044043 | 0.000200 |
| 130224 | -0.333936 | -0.285876 | -0.177380 | -0.034585 | 0.122728 | 0.717304 | 0.586194 | 0.149858 | 0.044043 | 0.000200 |
| 130225 | -0.333936 | -0.285876 | -0.177380 | -0.034585 | 0.122728 | 0.717304 | 0.586194 | 0.149858 | 0.044043 | 0.000200 |
| 130226 | -0.333936 | -0.285876 | -0.177380 | -0.034585 | 0.122728 | 0.717304 | 0.586194 | 0.149858 | 0.044043 | 0.000200 |
130227 rows × 10 columns
# Show the language PCA components of the validation split.
display(df_val.filter(like='lang_pca_'))
| lang_pca_0 | lang_pca_1 | lang_pca_2 | lang_pca_3 | lang_pca_4 | lang_pca_5 | lang_pca_6 | lang_pca_7 | lang_pca_8 | lang_pca_9 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.630230 | 0.082127 | 0.011908 | 0.000667 | -0.001832 | -0.003664 | 0.004847 | 0.003126 | 0.001352 | 0.000007 |
| 1 | 0.630230 | 0.082127 | 0.011908 | 0.000667 | -0.001832 | -0.003664 | 0.004847 | 0.003126 | 0.001352 | 0.000007 |
| 2 | 0.630230 | 0.082127 | 0.011908 | 0.000667 | -0.001832 | -0.003664 | 0.004847 | 0.003126 | 0.001352 | 0.000007 |
| 3 | -0.333936 | -0.285876 | -0.177380 | -0.034585 | 0.122728 | 0.717304 | 0.586194 | 0.149858 | 0.044043 | 0.000200 |
| 4 | -0.333936 | -0.285876 | -0.177380 | -0.034585 | 0.122728 | 0.717304 | 0.586194 | 0.149858 | 0.044043 | 0.000200 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27902 | -0.325397 | -0.265472 | -0.139378 | -0.018144 | 0.056656 | 0.163949 | -0.554448 | 0.768687 | 0.096090 | 0.000366 |
| 27903 | -0.645268 | 0.691893 | 0.044201 | 0.002208 | -0.006004 | -0.011748 | 0.015098 | 0.009484 | 0.004019 | 0.000021 |
| 27904 | -0.645268 | 0.691893 | 0.044201 | 0.002208 | -0.006004 | -0.011748 | 0.015098 | 0.009484 | 0.004019 | 0.000021 |
| 27905 | -0.645268 | 0.691893 | 0.044201 | 0.002208 | -0.006004 | -0.011748 | 0.015098 | 0.009484 | 0.004019 | 0.000021 |
| 27906 | -0.325397 | -0.265472 | -0.139378 | -0.018144 | 0.056656 | 0.163949 | -0.554448 | 0.768687 | 0.096090 | 0.000366 |
27907 rows × 10 columns
# Show the language PCA components of the test split.
display(df_test.filter(like='lang_pca_'))
| lang_pca_0 | lang_pca_1 | lang_pca_2 | lang_pca_3 | lang_pca_4 | lang_pca_5 | lang_pca_6 | lang_pca_7 | lang_pca_8 | lang_pca_9 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.325397 | -0.265472 | -0.139378 | -0.018144 | 0.056656 | 0.163949 | -0.554448 | 0.768687 | 0.096090 | 0.000366 |
| 1 | -0.645268 | 0.691893 | 0.044201 | 0.002208 | -0.006004 | -0.011748 | 0.015098 | 0.009484 | 0.004019 | 0.000021 |
| 2 | 0.630230 | 0.082127 | 0.011908 | 0.000667 | -0.001832 | -0.003664 | 0.004847 | 0.003126 | 0.001352 | 0.000007 |
| 3 | 0.630230 | 0.082127 | 0.011908 | 0.000667 | -0.001832 | -0.003664 | 0.004847 | 0.003126 | 0.001352 | 0.000007 |
| 4 | 0.630230 | 0.082127 | 0.011908 | 0.000667 | -0.001832 | -0.003664 | 0.004847 | 0.003126 | 0.001352 | 0.000007 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27901 | -0.343326 | -0.310771 | -0.247795 | -0.622538 | -0.569655 | -0.284901 | 0.186440 | 0.081487 | 0.028151 | 0.000135 |
| 27902 | 0.630230 | 0.082127 | 0.011908 | 0.000667 | -0.001832 | -0.003664 | 0.004847 | 0.003126 | 0.001352 | 0.000007 |
| 27903 | 0.630230 | 0.082127 | 0.011908 | 0.000667 | -0.001832 | -0.003664 | 0.004847 | 0.003126 | 0.001352 | 0.000007 |
| 27904 | 0.630230 | 0.082127 | 0.011908 | 0.000667 | -0.001832 | -0.003664 | 0.004847 | 0.003126 | 0.001352 | 0.000007 |
| 27905 | 0.630230 | 0.082127 | 0.011908 | 0.000667 | -0.001832 | -0.003664 | 0.004847 | 0.003126 | 0.001352 | 0.000007 |
27906 rows × 10 columns
Thumbnail stats columns
# Select the training columns whose name contains 'thumbnail_'
# (brightness, contrast and saturation in the output below).
df_train_stats = df_train.filter(like='thumbnail_')
display(df_train_stats)
| thumbnail_brightness | thumbnail_contrast | thumbnail_saturation | |
|---|---|---|---|
| 0 | 0.401444 | 0.490360 | 0.173942 |
| 1 | 0.401444 | 0.490360 | 0.173942 |
| 2 | 0.401444 | 0.490360 | 0.173942 |
| 3 | 0.401444 | 0.490360 | 0.173942 |
| 4 | 0.401444 | 0.490360 | 0.173942 |
| ... | ... | ... | ... |
| 130222 | 0.383354 | 0.563104 | 0.406289 |
| 130223 | 0.107356 | 0.373082 | 0.279782 |
| 130224 | 0.107356 | 0.373082 | 0.279782 |
| 130225 | 0.107356 | 0.373082 | 0.279782 |
| 130226 | 0.107356 | 0.373082 | 0.279782 |
130227 rows × 3 columns
# Summary statistics (count, mean, std, min, quartiles, max) per thumbnail column.
df_train_stats.describe()
| thumbnail_brightness | thumbnail_contrast | thumbnail_saturation | |
|---|---|---|---|
| count | 130227.000000 | 130227.000000 | 130227.000000 |
| mean | 0.405420 | 0.569546 | 0.401801 |
| std | 0.130601 | 0.125620 | 0.153554 |
| min | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.327867 | 0.489312 | 0.293722 |
| 50% | 0.405627 | 0.568635 | 0.401346 |
| 75% | 0.480718 | 0.659145 | 0.502840 |
| max | 1.000000 | 1.000000 | 1.000000 |
# Draw one distribution figure per thumbnail statistic column.
for stat in df_train_stats.columns:
    plot_distribution(df_train_stats, stat)